The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars.
Object recognition
The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
Apply a dimensionality reduction technique – PCA – and train a model using principal components instead of training the model using just the raw data.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# various scaling algorithms
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split # to split the dataset for training and testing
from sklearn.model_selection import cross_val_score # for performing cross validation of models
from sklearn.svm import SVC # Support vector machine model
from sklearn.utils import shuffle
from sklearn import metrics # to get various evaluation metrics
from sklearn.metrics import roc_auc_score # receiver operating curve score
from sklearn.metrics import accuracy_score # accuracy of prediction score
from sklearn.metrics import recall_score # recall score
from sklearn.metrics import precision_score # precision score
from sklearn.metrics import f1_score # f1 score
from sklearn.decomposition import PCA # performing principal components analysis
# Load the vehicle silhouette dataset into a dataframe
df = pd.read_csv('vehicle.csv')
# Peek at the first 10 rows; some missing values are already visible
df.head(10)
df.info()   # column dtypes and non-null counts
df.shape    # (rows, columns) of the dataframe
# Encode the string target labels as integers (alphabetical order: 0/1/2)
encoder = LabelEncoder()
df['class'] = encoder.fit_transform(df['class'])
# Count missing/null values per column
df.isna().sum()
# Replace missing feature values with each column's median
impute = SimpleImputer(missing_values=np.nan, strategy='median')
df.iloc[:, 0:18] = impute.fit_transform(df.iloc[:, 0:18])
# NOTE(review): earlier experiments imputed per-class medians here; the
# SimpleImputer above made that unnecessary, so the dead commented-out
# code was removed.
# Confirm no missing values remain after median imputation
df.isna().sum()
# Write the cleaned dataframe out for a manual sanity check
df.to_csv('check1.csv')
# Treat the encoded target as a categorical variable, not a numeric one
df['class'] = df['class'].astype('category')
df.info()
# Boxplot over the whole (unscaled) dataframe for a first look at scales
sns.boxplot( data=df, orient= "h" )
plt.figure(figsize = (18,132))
cols = df.columns.values
# For each feature column (the last column is the target 'class'), draw
# three subplots in an 18x3 grid: class-conditional distributions, the
# overall distribution, and a boxplot annotated with summary statistics.
plot_no = 1  # running subplot index within the grid
for col in cols[:-1]:
    # 1) distribution of the feature per vehicle class
    plt.subplot(18,3,plot_no)
    sns.distplot(df[df['class'] == 0][col], color= 'black', label='Bus')
    sns.distplot(df[df['class'] == 1][col], color = 'orange', label='Car')
    sns.distplot(df[df['class'] == 2][col], color = 'red', label='Van')
    plot_no += 1
    plt.title(f'Distribution of {col}')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
    plt.legend()
    # 2) overall distribution of the feature
    plt.subplot(18,3,plot_no)
    sns.distplot(df[col])
    plot_no += 1
    plt.title(f'Distribution of {col}')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
    # 3) boxplot with mean / std / median / IQR in the title
    plt.subplot(18,3,plot_no)
    sns.boxplot(df[col], color='yellowgreen')
    plot_no += 1
    q1, q3 = np.percentile(df[col],[25,75])
    IQR = q3 - q1
    plt.title(f'Boxplot of {col} \n \u03bc = {round(df[col].mean(), 3)}, SE = {round(df[col].std(),4)}, Median = {round(df[col].median(),3)}, IQR = {round(IQR, 3)} ')
    plt.xlabel(f'{col}')
    plt.ylabel('Frequency')
# Pairwise scatter plots with KDE diagonals across all features
sns.pairplot(df, diag_kind='kde')
# Correlation heatmap to spot multicollinear feature pairs
colormap = plt.cm.magma
plt.figure(figsize=(20, 20))
sns.heatmap(df.corr(), linewidths=0.1, vmax=1.0, square=True,
            cmap=colormap, linecolor='white', annot=True)
Since our objective is to recognize whether an object is a van, bus or car based on some input features, our main assumption is that there is little or no multicollinearity between the features. If our dataset has perfectly positively or negatively correlated attributes, as can be observed from our correlation analysis, there is a high chance that the performance of the model will be impacted by a problem called — “Multicollinearity”. Multicollinearity happens when one predictor variable in a multiple regression model can be linearly predicted from the others with a high degree of accuracy. This can lead to skewed or misleading results.
If two features are highly correlated then there is no point in using both. In such cases, we can drop one of the two correlated features, or transform the two correlated features into a third feature and drop the original two.
From the above correlation matrix we can see that there are many features which are highly correlated. If we analyse carefully, we will find many features with a correlation above 0.8, so we can decide to get rid of those columns whose correlation is ±0.8 or above. There are 8 such columns:
# Separate predictors and target
X = df.drop('class', axis=1)
y = df['class']
# 70/30 stratified split so each class keeps its proportion in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=1, stratify=y)
# Standardize features; the scaler is fit on the training data only so
# no information from the test set leaks into the preprocessing
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train.loc[:, :])
X_test_scaled = scale.transform(X_test.loc[:, :])
# Write the scaled values back into the original dataframes in place
X_train.loc[:, :] = X_train_scaled[:, :]
X_test.loc[:, :] = X_test_scaled[:, :]
X_train.head()
def draw_confusionmatrix(y_test, y_pred, cat1, cat2, cat3, dataset):
    """Print and plot a 3x3 confusion matrix for the vehicle classes.

    Parameters
    ----------
    y_test : array-like of true class labels (encoded 0/1/2)
    y_pred : array-like of predicted class labels
    cat1, cat2, cat3 : display names for classes 0, 1 and 2
    dataset : description of the dataset, used in the printed header
    """
    # labels must be passed by keyword: recent scikit-learn releases
    # removed the positional form of this argument
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1, 2])
    print("Confusion Matrix For : \n ", dataset)
    print(cm)
    # fmt='d' renders the integer counts without a spurious ".0" decimal
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=[cat1, cat2, cat3],
                yticklabels=[cat1, cat2, cat3])
    plt.ylabel('True Vehicle')
    plt.xlabel('Predicted Vehicle')
    plt.show()
# Baseline SVM (default RBF kernel) with C=1 on the scaled data
svc = SVC( C=1, gamma='auto')
svc.fit(X_train, y_train)
train_acc = svc.score(X_train, y_train)
test_acc = svc.score(X_test, y_test)
print("Accuracy on training set: {:.6f}".format(train_acc))
print("Accuracy on test set: {:.6f}".format(test_acc))
y_predict = svc.predict(X_test)
# Visualize where the classifier confuses the three vehicle types
draw_confusionmatrix(y_test, y_predict,"Bus", "Car", "Van", "Scaled Data Set" )
print(metrics.classification_report(y_test,y_predict))
The model correctly predicted 63 buses, 123 cars and 58 vans. It incorrectly predicted two buses as vans, six cars as vans and two vans as a car and a bus each.
# Perform cross validation over a (label, C, kernel) grid.
# FIX: the original list had ('R1', 10, 'rbf') — label "R1" should mean
# C=1 (parallel to P1/S1); it silently duplicated R10.
models = []
for kernel, prefix in [('poly', 'P'), ('sigmoid', 'S'), ('rbf', 'R')]:
    for c in [0.1, 1, 10, 20, 30]:
        label = prefix + ('01' if c == 0.1 else str(c))
        models.append((label, c, kernel))
# evaluate each model in turn
results = []
names = []
# NOTE(review): for cross validation the scaler is fit on the FULL
# feature set, so each fold's held-out part influences the scaling —
# a small leak; a Pipeline would avoid it.
scale2 = StandardScaler()
X_scaled = scale2.fit_transform(X.loc[:, :])
X_scaled
# 10-fold shuffled CV with a fixed seed for reproducibility
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
for name, c, kernel in models:
    cv_results = cross_val_score(SVC(C=c, kernel=kernel, gamma='auto'),
                                 X_scaled, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print('Accuracy of ' + msg)
# Retrain the SVM with the cross-validated best setting (rbf kernel, C=10)
svc = SVC( C=10, gamma='auto')
svc.fit(X_train, y_train)
tuned_train_acc = svc.score(X_train, y_train)
tuned_test_acc = svc.score(X_test, y_test)
print("Accuracy on training set: {:.6f}".format(tuned_train_acc))
print("Accuracy on test set: {:.6f}".format(tuned_test_acc))
y_predict = svc.predict(X_test)
# Confusion matrix for the tuned model
draw_confusionmatrix(y_test, y_predict,"Bus", "Car", "Van", "Scaled Data Set" )
print(metrics.classification_report(y_test,y_predict))
After doing cross validation after tuning the hyperparameters, the SVM with rbf kernel and C = 10 looks good, with gamma set to auto. The model correctly predicted 63 buses, 125 cars and 58 vans. It incorrectly predicted two buses as vans, four cars as vans and two vans as cars.
# Covariance matrix of the scaled predictors — should be 18x18
covMatrix = np.cov(X_scaled, rowvar=False)
print(covMatrix)
# Fit PCA keeping all 18 components to inspect the full variance spectrum
pca = PCA(n_components=18)
pca.fit(X_scaled)
# Eigenvalues: the variance explained by each component
print(pca.explained_variance_)
# Eigenvectors: the component loadings
print(pca.components_)
# Fraction of total variance captured by each component
print(pca.explained_variance_ratio_)
# Running total of variance explained, used to pick a cutoff
print(np.cumsum(pca.explained_variance_ratio_))
From the cumulative sum above , it looks like 95% of the variance is explained by the first 7 components. Therefore 7 dimensions are enough.
# Pairplot of all 18 principal-component scores
sns.pairplot(pd.DataFrame(pca.transform(X_scaled)), diag_kind='kde')
# Scree plot: individual bars plus the cumulative explained-variance step
component_idx = range(1, 19)
plt.bar(component_idx, pca.explained_variance_ratio_, alpha=0.5,
        align='center', label='individual explained variance')
plt.step(component_idx, np.cumsum(pca.explained_variance_ratio_),
         where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()
# Re-fit PCA keeping only the first 7 components (~95% of the variance)
pca2 = PCA(n_components=7)
pca2.fit(X_scaled)
print(pca2.components_)
print(pca2.explained_variance_ratio_)
# Project the scaled data onto the 7 retained components; this reduced
# dataset is what the SVM is trained on later
Xpca = pca2.transform(X_scaled)
# Verify the cumulative variance retained by the 7 components
print(np.cumsum(pca2.explained_variance_ratio_))
# pca2.transform returns an ndarray; wrap it in a dataframe
Xdf = pd.DataFrame(Xpca)
Xdf
sns.pairplot(Xdf, diag_kind='kde')
After dimensionality reduction using PCA our attributes have become independent, with no correlation among themselves, as most of them show a cloud of data points with no linear kind of relationship.
# Scree plot for the 7 retained components
reduced_idx = range(1, 8)
plt.bar(reduced_idx, pca2.explained_variance_ratio_, alpha=0.5,
        align='center', label='individual explained variance')
plt.step(reduced_idx, np.cumsum(pca2.explained_variance_ratio_),
         where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()
# 70/30 stratified split of the PCA-reduced data (same seed as before)
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(
    Xdf, y, test_size=0.30, random_state=1, stratify=y)
# Train the same SVM configuration on the reduced feature set
svc = SVC( C=10, gamma='auto')
svc.fit(pca_X_train, pca_y_train)
pca_train_acc = svc.score(pca_X_train, pca_y_train)
pca_test_acc = svc.score(pca_X_test, pca_y_test)
print("Accuracy on training set: {:.6f}".format(pca_train_acc))
print("Accuracy on test set: {:.6f}".format(pca_test_acc))
pca_y_predict = svc.predict(pca_X_test)
# Confusion matrix on the PCA-reduced test set
draw_confusionmatrix(pca_y_test, pca_y_predict,"Bus", "Car", "Van", "PCA Reduced Data Set" )
print(metrics.classification_report(pca_y_test,pca_y_predict))
After running the SVM model on the PCA-reduced set of scaled data, the model correctly predicted 60 buses, 115 cars and 54 vans. It incorrectly predicted two buses as vans, three buses as cars, ten cars as vans, four cars as buses and six vans as cars.
# Perform cross validation of the same (label, C, kernel) grid on the
# PCA-reduced data.
# FIX: the original list had ('R1', 10, 'rbf') — label "R1" should mean
# C=1 (parallel to P1/S1); it silently duplicated R10.
models2 = []
for kernel, prefix in [('poly', 'P'), ('sigmoid', 'S'), ('rbf', 'R')]:
    for c in [0.1, 1, 10, 20, 30]:
        label = prefix + ('01' if c == 0.1 else str(c))
        models2.append((label, c, kernel))
# evaluate each model in turn
results2 = []
names2 = []
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
for name, c, kernel in models2:
    cv_results2 = cross_val_score(SVC(C=c, kernel=kernel, gamma='auto'),
                                  Xdf, y, cv=kfold, scoring='accuracy')
    results2.append(cv_results2)
    names2.append(name)
    msg = "%s: %f (%f)" % (name, cv_results2.mean(), cv_results2.std())
    print('Accuracy of ' + msg)
# Retrain with the best setting (rbf, C=10) and evaluate on the held-out split
svc2 = SVC(C=10, gamma='auto')
svc2.fit(pca_X_train, pca_y_train)
print("Accuracy on training set: {:.6f}".format(svc2.score(pca_X_train, pca_y_train)))
print("Accuracy on test set: {:.6f}".format(svc2.score(pca_X_test, pca_y_test)))
pca_y_predict = svc2.predict(pca_X_test)
print(accuracy_score(pca_y_test, pca_y_predict))
# Getting the confusion matrix
draw_confusionmatrix(pca_y_test, pca_y_predict, "Bus", "Car", "Van", "PCA Reduced Data Set")
print(metrics.classification_report(pca_y_test, pca_y_predict))
After doing cross validation after tuning the hyperparameters, the SVM with rbf kernel and C = 10 looks good, with gamma set to auto. The model correctly predicted 60 buses, 115 cars and 54 vans. It incorrectly predicted two buses as vans, three buses as cars, four cars as buses, ten cars as vans and six vans as cars.
# Heatmap of the 7 component loadings against the original 18 features
df_comp = pd.DataFrame(pca2.components_, columns=list(X.columns.values))
plt.figure(figsize=(12, 6))
sns.heatmap(df_comp, cmap='plasma')